
#### REPLICATION CODE FOR Company Towns: Single-Industry Dominance and Local Government Capacity

# See README for folder structure
# See CODEBOOK for variable descriptions/sources

# This file produces the figures and tables in the supplemental information 

# Set up + load data ####

library(pacman)
p_load(dplyr, here, tidyr, stringr, did, ggplot2, scales, estimatr, usmap,
       modelsummary, kableExtra, tibble, gridExtra)

dat <- read.csv(here("county_data.csv"))


# A.1: map of coal employment ####

# select plotting variables to merge in
plottab <- select(dat, fips_long, state_name, pct1920)
plottab$fips <- plottab$fips_long
# select states to plot
states <- c("Indiana", "Kentucky", "West Virginia", "Ohio", "Illinois", "Pennsylvania")

# plot 
plot_usmap(regions="counties", data=plottab, values="pct1920",
           include=states) +
  scale_fill_gradient2(low = 'darkblue', mid = 'white', 
                       high = 'darkred', midpoint=0, na.value="transparent", 
                       name="Proportion") +
  ggtitle("Prop. Labor Force in Coal Industry, 1920") +
  theme(text=element_text(size=15), legend.position = "right")


# A.2: Difference in Differences ####
  # setup ####
 # identify treated counties
treated <- c(17055, 17155, 18055, 21013, 21095, 21107, 21121, 21193, 21195,
             21235, 39127, 42063, 54019, 54045, 54081, 54109, 17027, 18021,
             18125, 42059, 54057, 54093)

# create panel version of data 
datb <- dat %>%
  # limit to counties with high or no coal industry
  filter(highest<.05|highest>.2) %>%
  # select and pivot to long: government employment
  select(fips_long, pct1920, firsthigh, county_name, state_name, 
         contains("gov")&ends_with("full"), contains("locgov_emp")) %>%
  pivot_longer(cols=c(-fips_long, -pct1920, -firsthigh, -county_name, -state_name), names_to="year", names_prefix="gov|_full|locgov_emp") %>%
  # select and pivot to long: population
  full_join(dat %>%
              filter(highest<.05|highest>.2) %>%
              select(fips_long,
                     contains("pop")&ends_with("full"), contains(c("Population", "pop197", "pop198", "pop199", "pop2")), -contains("lab")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pop", names_prefix="pop|_full|Population_")
  ) %>%
  # limit to years with employment data 
  filter(year %in% c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 
                     1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,
                     2002, 2007)) %>%
  # recode gov employment variable: set to NA when population missing, and add 1 for logging
  mutate(value = case_when(!is.na(pop)~value+1)) %>%
  # set year to numeric
  mutate(year = as.numeric(year)) %>%
  # set first year of high coal employment to 0 if never high coal employment (for did package)
  mutate(firsthigh = case_when(!is.na(firsthigh)~firsthigh,
                               T~0)) %>%
  # indicator for post-treatment
  mutate(post = case_when(firsthigh==0~0,
                          year>firsthigh~1,
                          T~0)) %>%
  # limit to counties with never high coal employment or in treated group
  filter(firsthigh==0|fips_long %in% treated)

# calculate residualized government employment measure
# regress logged employment on interaction of population, state, year, only among non-coal counties
mod1 <- lm(log(value)~log(pop)*state_name*factor(year), data=datb[!(datb$fips_long %in% treated),], na.action=na.exclude)
# calculate difference between actual and predicted gov employment
datb$resid_gov <- log(datb$value) - predict(mod1, type="response", newdata = datb)


  # A.2.1: pretrend checks ####

# run diff in diff model
pre_mod <- att_gt(yname="resid_gov",
            tname = "year",
            idname = "fips_long",
            gname = "firsthigh",
            data=datb %>% drop_na(pop) %>% mutate(fips_long = as.numeric(fips_long)),
            weightsname = "pop",
            control = "nevertreated",
            allow_unbalanced_panel = T)
# tidy results
pre_mod <- tidy(pre_mod)
# create indicator for pre-treatment
pre_mod$pre <- ifelse(pre_mod$time<pre_mod$group, 1, 0)

# plot 
dodge <- position_dodge(3)
pre_mod %>%
  filter(pre==1) %>%
  ggplot() +
  geom_point(aes(x=time, y=estimate, color=factor(group)), position=dodge) + 
  geom_errorbar(aes(x=time, ymin=conf.low, ymax=conf.high, color=factor(group)), position=dodge, width=2) +
  theme_bw() + 
  theme(text=element_text(size=20)) +
  xlab("Year") + ylab("Pretreatment Estimate") +
  ggtitle("Pretrend Estimates by Group") + 
  scale_color_manual(name="Treated Year",
                     values=c("gold", "forestgreen", "blue", "purple", "red")) +
  geom_hline(yintercept=0, linetype=2)



  # A.2.2: constructing the government employment measure ####

# create new version of panel data including all counties (not just those in diff in diff)

datb <- dat %>%
  # select and pivot to long: government employment
  select(fips_long, pct1920, county_name, state_name, 
         contains("gov")&ends_with("full"), contains("locgov_emp")) %>%
  pivot_longer(cols=c(-fips_long, -pct1920, -county_name, -state_name), names_to="year", names_prefix="gov|_full|locgov_emp") %>%
  # select and pivot to long: population
  full_join(dat %>%
              select(fips_long,
                     contains("pop")&ends_with("full"), contains(c("Population", "pop197", "pop198", "pop199", "pop2")), -contains("lab")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pop", names_prefix="pop|_full|Population_")
  ) %>%
  # limit to years with employment data 
  filter(year %in% c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 
                     1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,
                     2002, 2007)) %>%
  # recode gov employment variable: set to NA when population missing, and add 1 for logging
  mutate(value = case_when(!is.na(pop)~value+1)) %>%
  # set year to numeric
  mutate(year = as.numeric(year))

# calculate residualized government employment measure
# regress logged employment on interaction of population, state, year, only among non-coal counties
mod1 <- lm(log(value)~log(pop)*state_name*factor(year), data=datb[!(datb$fips_long %in% treated),], na.action=na.exclude)
# calculate difference between actual and predicted gov employment
datb$resid_gov <- log(datb$value) - predict(mod1, type="response", newdata = datb)
# create variable for predicted gov employment 
datb$pred_gov <- predict(mod1, type="response", newdata = datb)

# plot: predicted vs. actual government employees
ggplot(datb %>%
         # remove missing population
              filter(!is.na(pop)) %>%
         # prettify labels and create group variables
              mutate(g = case_when(pct1920>.2~"Coal County",
                                   pct1920<=.2~"Non-Coal County")) %>%
              mutate(pop_g = case_when(pop<25000~"Population < 25k",
                                       pop>=25000&pop<50000~"Population 25-50k",
                                       pop>=50000~"Population >= 50k")) %>%
              mutate(year_g = case_when(year<1900~"19th Century",
                                        year>=1900&year<1957~"1900 - 1940",
                                        year>=1957~"1957-Present")) %>%
              mutate(pop_g = factor(pop_g, levels=c("Population < 25k","Population 25-50k","Population >= 50k"), ordered=TRUE)) %>%
              mutate(year_g = factor(year_g, levels=c("19th Century","1900 - 1940","1957-Present"), ordered=TRUE))) +
  # plot predicted vs. actual
  geom_point(aes(x=exp(pred_gov), y=value)) +
  # facet by population group and year group
  facet_wrap(~pop_g+year_g+g, nrow=3) +
  # add 90 degree line
  geom_abline(slope=1, intercept=0) + 
  # log transform axes
  scale_x_continuous(trans='log', breaks = exp(c(1,3,5,7,9,11)), labels = scales::label_number(big.mark = ",")) +
  scale_y_continuous(trans='log', breaks = exp(c(1,3,5,7,9,11)), labels = scales::label_number(big.mark = ",")) +
  theme_bw() + theme(text=element_text(size=15),
                     axis.text.x = element_text(angle = 45, vjust = .5, hjust=.5)) + 
  xlab("Predicted Government Employees") + ylab("Actual Government Employees") +
  ggtitle("Predicted and Actual Government Employees")


# plot: time series

a <- datb %>%
  group_by(Year=year) %>%
  summarize(Employment=median(value, na.rm=T)) %>%
  ggplot() +
  geom_point(aes(x=Year, y=Employment)) +
  theme_bw() + theme(text=element_text(size=15)) +
  ggtitle("Untransformed")
b <- datb %>%
  group_by(Year=year) %>%
  summarize(Employment=median(resid_gov, na.rm=T)) %>%
  ggplot() +
  geom_point(aes(x=Year, y=Employment)) +
  theme_bw() + theme(text=element_text(size=15)) +
  ggtitle("Residualized") + ylim(c(-.175,.143))

gridExtra::grid.arrange(a,b, nrow=1)

### time series tests 

p_load(plm)

# add in blank rows for non-covered years to ensure correct spacing
datb_b <- datb %>%
  complete(nesting(fips_long), year = full_seq(year, period=1))
# create panel version of data
pdat <- pdata.frame(datb, index=c("fips_long", "year"), drop.index = TRUE, row.names = TRUE)

# alternately, aggregate by decade
datb_c <- datb %>%
  filter(!is.na(value)) %>%
  mutate(decade = floor(year / 10) * 10) %>%
  group_by(fips_long, decade) %>%
  summarise(resid_gov = mean(resid_gov, na.rm = TRUE),
            gov = mean(value, na.rm=T)) %>%
  ungroup()
# and create panel version
pdat_c <- pdata.frame(datb_c, index=c("fips_long", "decade"), drop.index = TRUE, row.names = TRUE)


# better-powered by-year structure for the stationarity tests
  # IPS: raw government employment
purtest(pdat$value, test="ips", lags=3, exo="intercept")
purtest(pdat$value, test="ips", lags=3, exo="trend")
  # IPS: residualized government employment
purtest(pdat$resid_gov, test="ips", lags=3, exo="intercept")
purtest(pdat$resid_gov, test="ips", lags=3, exo="trend")
  # Fisher: raw government employment
purtest(pdat$value, test="madwu", exo="intercept", lags=3)
purtest(pdat$value, test="madwu", exo="trend", lags=3)
  # Fisher: residualized government employment
purtest(pdat$resid_gov, test="madwu", exo="intercept", lags=3)
purtest(pdat$resid_gov, test="madwu", exo="trend", lags=3)

# decade structure for modeling decay, as it's less reliant
# on the presence of different county-years in different spacing bins

# exponential decay function
acf(pdat_c$resid_gov, na.action = na.pass, lag.max = 10)

acf_vals <- acf(pdat_c$resid_gov, plot = FALSE, na.action=na.pass)$acf[-1]
lags <- 1:length(acf_vals)
fit <- nls(acf_vals ~ a * exp(-b * lags), start = list(a = 1, b = 0.1))
summary(fit)

# ARIMA model
p_load(forecast)
fit <- auto.arima(pdat_c$resid_gov)
summary(fit)

phi <- c(0.31, 0.13, 0.06, 0.03)
irf <- ARMAtoMA(ar = phi, ma = 0, lag.max = 10)
plot(irf, type = "h", main = "Impulse Response Function")

# clean up workspace
rm(datb_b, datb_c, fit, mod1, pdat, pdat_c, acf_vals, irf, lags, phi)


  # A.2.3: validating the government employment measure ####

# to look at tax rates and coal employment in each year:
# merge in coal by year and property tax/values (to datb version from section A.2.2)
datc <- datb %>%
  full_join(dat %>%
              select(fips_long,
                     contains("pct1")&ends_with("0")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="coal", names_prefix="pct") %>%
              mutate(year = as.numeric(year))
  ) %>%
  full_join(dat %>%
              select(fips_long,
                     contains("pval")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pval", names_prefix="pval") %>%
              mutate(year = as.numeric(year)) %>%
              mutate(year = case_when(year==1902~1900,
                                      year==1912~1910,
                                      year==1922~1920,
                                      year==1931~1930,
                                      T~year))
  ) %>%
  full_join(dat %>%
              select(fips_long,
                     contains("taxlocal")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="taxlocal", names_prefix="taxlocal") %>%
              mutate(year = as.numeric(year)) %>%
              mutate(year = case_when(year==1902~1900,
                                      year==1912~1910,
                                      year==1922~1920,
                                      year==1931~1930,
                                      T~year))
  )

datc <- datc %>% mutate(tax_rate = taxlocal/pval)

# test relationship between property taxes and residualized employment

mod1 <- lm_robust(log(taxlocal)~log(pval)+resid_gov, 
                  data=datc %>%  
                    filter(year %in% c(1880:1930)) %>% # limit to years with tax data available
                    filter(tax_rate<.1),  # limit to observations with sensible tax rates 
                  fixed_effects=~factor(year)+state_name)
summary(mod1)


# look at highest- and lowest-RGE county-years among coal counties
datc %>%
  filter(coal>.2) %>%
  slice_min(resid_gov, n = 2, by=state_name)

datc %>%
  filter(coal>.2) %>%
  slice_max(resid_gov, n = 2, by=state_name)

  # A.2.4: effects by cohort ####
####
# RUN #setup chunk under A.2
###

mod2 <- att_gt(yname="resid_gov",
               tname = "year",
               idname = "fips_long",
               gname = "firsthigh",
               data=datb %>% drop_na(pop) %>% mutate(fips_long = as.numeric(fips_long)),
               weightsname = "pop",
               control = "nevertreated",
               allow_unbalanced_panel = T)

ggplot(as.data.frame(cbind(mod2$group, mod2$t, mod2$att, mod2$se)),
       aes(x=V2-V1, y=V3, color=as.factor(V1))) + 
  geom_point() +
  geom_errorbar(aes(ymin=V3-(1.96*V4), ymax=V3+(1.96*V4))) +
  theme_bw() +
  ylab("ATE on Government Employment") +
  xlab("Time Relative to Treatment Onset")+
  scale_color_manual(values=c("gray80", "gray60", "gray40", 
                              "gray20", "black"),
                     name="Cohort") + 
  geom_hline(yintercept=0, linetype=2) +
  ggtitle("ATE by Treatment Cohort") +
  theme(text = element_text(size=20)) +
  facet_wrap(~as.factor(V1)) +
  xlim(c(-22,80))

  # A.2.5: characteristics of treated and comparison counties ####

# plot of coal employment trajectories for ever high-coal counties
dat %>%
  filter(highest>.4) %>%
  pivot_longer(cols=c(-fips_long, -county_name, -state_name), names_to="year", names_prefix="pct|mining") %>%
  mutate(treated = fips_long %in% treated) %>%
  ggplot() +
  geom_line(aes(x=as.numeric(year), y=value, group=factor(fips_long), color=treated)) +
  theme_bw() + 
  scale_color_manual(values=c("gray", "black"), name="Treated", label=c("Not Selected", "Selected")) +
  xlab("Year") + ylab("Prop. Coal in Lab Force") + 
  ggtitle("Pct. Mining in High Coal-Employment Counties: \n Treated Units Highlighted")

# table of characteristics


dat %>%
  mutate(treat = case_when(fips_long %in% treated~"T",
                           highest<.05~"C",
                           highest>=.05&!(fips_long %in% treated)~"O")) %>%
  group_by(treat) %>%
  summarize(n=n(),
            pop1870=median(pop1860_full, na.rm=T),
            pop1920=median(pop1920_full, na.rm=T),
            pop1970=median(pop1970, na.rm=T),
            pct1870=median(pct1860, na.rm=T),
            pct1920=median(pct1920, na.rm=T),
            pct1975=median(mining1975, na.rm=T),
            govpp1870=median(gov1870_full/pop1870_full*10000, na.rm=T),
            govpp1920=median(gov1920_full/pop1920_full*10000, na.rm=T),
            govpp1972=median(locgov_emp1972/pop1972*10000, na.rm=T),
            prop1880=median(pct1880, na.rm=T),
            prop1920=median(pct1920, na.rm=T)) %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  mutate(rowname = c("treat", "N", 
                     1870, 1920, 1970, 
                     1870, 1920, 1975,
                     1870, 1920, 1972,
                     1880, 1920)) %>%
  rename("Group:"="rowname", "Control"="V1", "Other"="V2", "Treated"="V3") %>%
  slice(-1) %>%
  as_tibble() %>%
  mutate(across(Control:Treated, ~round(as.numeric(.x), 2))) %>%
  kable(format="latex", booktabs = T) %>%
  pack_rows("Population", 2, 4) %>%
  pack_rows("Pct. Working in Coal", 5, 7) %>%
  pack_rows("Gov. Emp. Per 10k Residents", 8, 10) %>%
  pack_rows("Prop. in Largest Industry", 11, 12) 
  

# map of counties 
# variable for group 
dat <- dat %>% mutate(treat = case_when((fips_long %in% treated)~"Treated", 
                                        highest<.05&is.na(slavesperhouse)&coal_tons>0~"Control + Some Coal + No Slavery",
                                        highest<.05&is.na(slavesperhouse)~"Control + No Slavery",
                                        highest<.05&coal_tons>0~"Control + Some Coal Deposits", 
                                        highest<.05~"Control (Low Coal Emp)", 
                                        T~"None")) %>%
  mutate(treat = as.factor(treat))
# select plotting variables to merge in
plottab <- select(dat, fips_long, state_name, pct1920, treat)
plottab$fips <- plottab$fips_long
# select states to plot
states <- c("Indiana", "Kentucky", "West Virginia", "Ohio", "Illinois", "Pennsylvania")

# plot 
plot_usmap(regions="counties", data=plottab, values="treat",
           include=states) +
  scale_fill_manual(name="Group", 
                    values=c( "blue", "dodgerblue4", "navy","lightskyblue", "transparent", "red")) +
  ggtitle("Prop. Labor Force in Coal Industry, 1920") +
  theme(text=element_text(size=15), legend.position = "right")


  # A.2.6: robustness checks ####

    # omit slaveholding counties

# create panel version of data 
datb <- dat %>%
  # remove  slaveholding counties 
  filter(is.na(slavesperhouse)) %>%
  # limit to counties with high or no coal industry
  filter(highest<.05|highest>.2) %>%
  # select and pivot to long: government employment
  select(fips_long, pct1920, firsthigh, county_name, state_name, 
         contains("gov")&ends_with("full"), contains("locgov_emp")) %>%
  pivot_longer(cols=c(-fips_long, -pct1920, -firsthigh, -county_name, -state_name), names_to="year", names_prefix="gov|_full|locgov_emp") %>%
  # select and pivot to long: population
  full_join(dat %>%
              filter(highest<.05|highest>.2) %>%
              select(fips_long,
                     contains("pop")&ends_with("full"), contains(c("Population", "pop197", "pop198", "pop199", "pop2")), -contains("lab")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pop", names_prefix="pop|_full|Population_")
  ) %>%
  # limit to years with employment data 
  filter(year %in% c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 
                     1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,
                     2002, 2007)) %>%
  # recode gov employment variable: set to NA when population missing, and add 1 for logging
  mutate(value = case_when(!is.na(pop)~value+1)) %>%
  # set year to numeric
  mutate(year = as.numeric(year)) %>%
  # set first year of high coal employment to 0 if never high coal employment (for did package)
  mutate(firsthigh = case_when(!is.na(firsthigh)~firsthigh,
                               T~0)) %>%
  # indicator for post-treatment
  mutate(post = case_when(firsthigh==0~0,
                          year>firsthigh~1,
                          T~0)) %>%
  # limit to counties with never high coal employment or in treated group
  filter(firsthigh==0|fips_long %in% treated)

# calculate residualized government employment measure
# regress logged employment on interaction of population, state, year, only among non-coal counties
mod1 <- lm(log(value)~log(pop)*state_name*factor(year), data=datb[!(datb$fips_long %in% treated),], na.action=na.exclude)
# calculate difference between actual and predicted gov employment
datb$resid_gov <- log(datb$value) - predict(mod1, type="response", newdata = datb)

# run difference in differences model
mod1 <- att_gt(yname="resid_gov",
               tname = "year",
               idname = "fips_long",
               gname = "firsthigh",
               data=datb %>% 
                 drop_na(pop) %>% 
                 mutate(fips_long = as.numeric(fips_long)) ,
               weightsname = "pop",
               control = "nevertreated",
               allow_unbalanced_panel = T)

# aggregate across cohorts
did_out <- aggte(mod1, type = "dynamic", 
                 na.rm=T,
                 max_e = 77,
                 min_e=-20)

# plot
ggdid(did_out) + theme_bw() + 
  xlab("Years pre/post-treatment") + 
  ylab("Diff in diffs: resid. gov emp.") + 
  scale_color_manual(name="Time",
                     label=c("Pre-treat",
                             "Post-treat"),
                     values=c("grey", "black")) +
  geom_hline(yintercept=0, linetype=3) + 
  theme(text = element_text(size=20)) + 
  scale_x_continuous(breaks=c(seq(-20, 80, 10)))



   # omit counties without any coal deposits

# create panel version of data 
datb <- dat %>%
  # remove  slaveholding counties 
  filter(coal_tons>0) %>%
  # limit to counties with high or no coal industry
  filter(highest<.05|highest>.2) %>%
  # select and pivot to long: government employment
  select(fips_long, pct1920, firsthigh, county_name, state_name, 
         contains("gov")&ends_with("full"), contains("locgov_emp")) %>%
  pivot_longer(cols=c(-fips_long, -pct1920, -firsthigh, -county_name, -state_name), names_to="year", names_prefix="gov|_full|locgov_emp") %>%
  # select and pivot to long: population
  full_join(dat %>%
              filter(highest<.05|highest>.2) %>%
              select(fips_long,
                     contains("pop")&ends_with("full"), contains(c("Population", "pop197", "pop198", "pop199", "pop2")), -contains("lab")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pop", names_prefix="pop|_full|Population_")
  ) %>%
  # limit to years with employment data 
  filter(year %in% c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 
                     1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,
                     2002, 2007)) %>%
  # recode gov employment variable: set to NA when population missing, and add 1 for logging
  mutate(value = case_when(!is.na(pop)~value+1)) %>%
  # set year to numeric
  mutate(year = as.numeric(year)) %>%
  # set first year of high coal employment to 0 if never high coal employment (for did package)
  mutate(firsthigh = case_when(!is.na(firsthigh)~firsthigh,
                               T~0)) %>%
  # indicator for post-treatment
  mutate(post = case_when(firsthigh==0~0,
                          year>firsthigh~1,
                          T~0)) %>%
  # limit to counties with never high coal employment or in treated group
  filter(firsthigh==0|fips_long %in% treated)

# calculate residualized government employment measure
# regress logged employment on interaction of population, state, year, only among non-coal counties
mod1 <- lm(log(value)~log(pop)*state_name*factor(year), data=datb[!(datb$fips_long %in% treated),], na.action=na.exclude)
# calculate difference between actual and predicted gov employment
datb$resid_gov <- log(datb$value) - predict(mod1, type="response", newdata = datb)

# run difference in differences model
mod1 <- att_gt(yname="resid_gov",
               tname = "year",
               idname = "fips_long",
               gname = "firsthigh",
               data=datb %>% 
                 drop_na(pop) %>% 
                 mutate(fips_long = as.numeric(fips_long)) ,
               weightsname = "pop",
               control = "nevertreated",
               allow_unbalanced_panel = T)

# aggregate across cohorts
did_out <- aggte(mod1, type = "dynamic", 
                 na.rm=T,
                 max_e = 77,
                 min_e=-20)

# plot
ggdid(did_out) + theme_bw() + 
  xlab("Years pre/post-treatment") + 
  ylab("Diff in diffs: residualized gov emp.") + 
  scale_color_manual(name="Time",
                     label=c("Pre-treat",
                             "Post-treat"),
                     values=c("grey", "black")) +
  geom_hline(yintercept=0, linetype=3) + 
  theme(text = element_text(size=20)) + 
  scale_x_continuous(breaks=c(seq(-20, 80, 10)))


# leave one out

# identify treated counties
treated <- c(17055, 17155, 18055, 21013, 21095, 21107, 21121, 21193, 21195,
             21235, 39127, 42063, 54019, 54045, 54081, 54109, 17027, 18021,
             18125, 42059, 54057, 54093)

# create panel version of data 
datb <- dat %>%
  # limit to counties with high or no coal industry
  filter(highest<.05|highest>.2) %>%
  # select and pivot to long: government employment
  select(fips_long, pct1920, firsthigh, county_name, state_name, 
         contains("gov")&ends_with("full"), contains("locgov_emp")) %>%
  pivot_longer(cols=c(-fips_long, -pct1920, -firsthigh, -county_name, -state_name), names_to="year", names_prefix="gov|_full|locgov_emp") %>%
  # select and pivot to long: population
  full_join(dat %>%
              filter(highest<.05|highest>.2) %>%
              select(fips_long,
                     contains("pop")&ends_with("full"), contains(c("Population", "pop197", "pop198", "pop199", "pop2")), -contains("lab")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pop", names_prefix="pop|_full|Population_")
  ) %>%
  # limit to years with employment data 
  filter(year %in% c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 
                     1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,
                     2002, 2007)) %>%
  # recode gov employment variable: set to NA when population missing, and add 1 for logging
  mutate(value = case_when(!is.na(pop)~value+1)) %>%
  # set year to numeric
  mutate(year = as.numeric(year)) %>%
  # set first year of high coal employment to 0 if never high coal employment (for did package)
  mutate(firsthigh = case_when(!is.na(firsthigh)~firsthigh,
                               T~0)) %>%
  # indicator for post-treatment
  mutate(post = case_when(firsthigh==0~0,
                          year>firsthigh~1,
                          T~0)) %>%
  # limit to counties with never high coal employment or in treated group
  filter(firsthigh==0|fips_long %in% treated)

# calculate residualized government employment measure
# regress logged employment on interaction of population, state, year, only among non-coal counties
mod1 <- lm(log(value)~log(pop)*state_name*factor(year), data=datb[!(datb$fips_long %in% treated),], na.action=na.exclude)
# calculate difference between actual and predicted gov employment
datb$resid_gov <- log(datb$value) - predict(mod1, type="response", newdata = datb)




res <- data.frame(matrix(nrow=length(treated), ncol=2))
colnames(res) <- c("att", "se")

for(i in 1:length(treated)){
  temp <- datb %>% 
    filter(fips_long!=treated[i]) %>% 
    filter((year - firsthigh)<=40|(year - firsthigh)>1000) %>%
    drop_na(pop)
  mod3 <- att_gt(yname="resid_gov",
                 tname = "year",
                 idname = "fips_long",
                 gname = "firsthigh",
                 data=temp,
                 weightsname = "pop",
                 control = "nevertreated",
                 allow_unbalanced_panel = T)
  mod3agg <- aggte(mod3, type = "simple", na.rm=T)
  res[i,] <- c(mod3agg$overall.att, mod3agg$overall.se)
  print(i)
}


res$y <- rnorm(n=22, mean=1, sd=.1)

ggplot(res) + 
  geom_point(aes(x=att, y=y), color="gray60") + 
  geom_errorbarh(aes(xmin=(att-1.96*se), xmax=(att+1.96*se),  y=y), color="gray60") +
  coord_cartesian(xlim=c(-.65, 0), ylim=c(0,2)) +
  theme_bw() + 
  geom_vline(xintercept=-.2985, linetype=2) +
  ylab("") + scale_y_continuous(breaks=NULL) +
  xlab("Leave-one-out ATT Estimates") + 
  annotate("text", x=-.29, y=1.5, label="Full Sample ATT (-.30)", hjust=0, size=5) + 
  theme(text = element_text(size=20)) 



# alternative thresholds 

# original treated group
treated <- c(17055, 17155, 18055, 21013, 21095, 21107, 21121, 21193, 21195,
             21235, 39127, 42063, 54019, 54045, 54081, 54109, 17027, 18021,
             18125, 42059, 54047, 54093) 
# variable for first year with very high coal employment
dat <- dat %>% 
  mutate(firsthigh_d = case_when(highest<.4~NA_real_,
                                 pct1850>=.4~1850,
                                 pct1860>=.4~1860,
                                 pct1870>=.4~1870,
                                 pct1880>=.4~1880,
                                 pct1900>=.4~1900,
                                 pct1910>=.4~1910,
                                 pct1920>=.4~1920,
                                 pct1930>=.4~1930,
                                 pct1940>=.4~1940,
                                 T~NA_real_))
# variable for first year with any appreciable coal employment
dat <- dat %>% 
  mutate(firstsome_d = case_when(highest<.4~NA_real_,
                                 pct1850>.05~1850,
                                 pct1860>.05~1860,
                                 pct1870>.05~1870,
                                 pct1880>.05~1880,
                                 pct1900>.05~1900,
                                 pct1910>.05~1910,
                                 pct1920>.05~1920,
                                 pct1930>.05~1930,
                                 pct1940>.05~1940, 
                                 T~NA_real_))
# variable for first year with data available
dat <- dat %>%
  mutate(firstdata = case_when(highest<.4~NA_real_,
                               !is.na(pct1850)~1850,
                               !is.na(pct1860)~1860,
                               !is.na(pct1870)~1870,
                               !is.na(pct1880)~1880,
                               !is.na(pct1900)~1900,
                               !is.na(pct1910)~1910,
                               !is.na(pct1920)~1920,
                               !is.na(pct1930)~1930,
                               !is.na(pct1940)~1940, 
                               T~NA_real_))

# recover original treatment group
treated_b <- dat %>%
  filter(highest>.4&
           firsthigh==firstsome_d&
           firsthigh>1860&
           firsthigh_d<1975&
           (firsthigh_d-firsthigh)<=20&
           firstdata!=firsthigh) %>%
  select(fips_long) 
treated_b <- treated_b$fips_long
# check that all are present and accounted for
treated[!(treated %in% treated_b)]
treated_b[!(treated_b %in% treated)]


# function to create treatment assignment vector based on thresholds for each variable
create_tvec <- function(high_top=.4, high_bottom=.2, some=.05){
  dat <- dat %>% 
    mutate(firsthigh_l = case_when(highest<high_top~NA_real_,
                                   pct1850>=high_bottom~1850,
                                   pct1860>=high_bottom~1860,
                                   pct1870>=high_bottom~1870,
                                   pct1880>=high_bottom~1880,
                                   pct1900>=high_bottom~1900,
                                   pct1910>=high_bottom~1910,
                                   pct1920>=high_bottom~1920,
                                   pct1930>=high_bottom~1930,
                                   pct1940>=high_bottom~1940,
                                   T~NA_real_))
  
  dat <- dat %>% 
    mutate(firsthigh_h = case_when(highest<high_top~NA_real_,
                                   pct1850>=high_top~1850,
                                   pct1860>=high_top~1860,
                                   pct1870>=high_top~1870,
                                   pct1880>=high_top~1880,
                                   pct1900>=high_top~1900,
                                   pct1910>=high_top~1910,
                                   pct1920>=high_top~1920,
                                   pct1930>=high_top~1930,
                                   pct1940>=high_top~1940,
                                   T~NA_real_))
  
  dat <- dat %>% 
    mutate(firstsome_d = case_when(highest<high_top~NA_real_,
                                   pct1850>some~1850,
                                   pct1860>some~1860,
                                   pct1870>some~1870,
                                   pct1880>some~1880,
                                   pct1900>some~1900,
                                   pct1910>some~1910,
                                   pct1920>some~1920,
                                   pct1930>some~1930,
                                   pct1940>some~1940, 
                                   T~NA_real_))
  
  treated_b <- dat %>%
    filter(highest>high_top&
             firsthigh==firstsome_d&
             firsthigh>1860&
             firsthigh_d<1975&
             (firsthigh_d-firsthigh)<=20&
             firstdata!=firsthigh) %>%
    select(fips_long) 
  treated_b <- treated_b$fips_long
  return(treated_b)
}
# function to create control assignments based on threshold
create_cvec <- function(none=.05){
  control <- (dat %>% filter(highest<none) %>% select(fips_long))$fips_long
  return(control)
}

# test creating thresholds
cvec <- create_cvec(none=.05)
tvec <- create_tvec(high_top=.4, high_bottom=.2, some=.05)

# function for getting ATT estimate by thresholds
run_estimate <- function(cvec, tvec, high_bottom, high_top){
  
  # identify relevant observations
  dat <- dat %>% 
    mutate(firsthigh_l = case_when(highest<high_top~NA_real_,
                                   pct1850>=high_bottom~1850,
                                   pct1860>=high_bottom~1860,
                                   pct1870>=high_bottom~1870,
                                   pct1880>=high_bottom~1880,
                                   pct1900>=high_bottom~1900,
                                   pct1910>=high_bottom~1910,
                                   pct1920>=high_bottom~1920,
                                   pct1930>=high_bottom~1930,
                                   pct1940>=high_bottom~1940,
                                   T~NA_real_))
  # create panel data
  datb <- dat %>%
    # select and pivot to long: government employment
    select(fips_long, pct1920, firsthigh_l, county_name, state_name, 
           contains("gov")&ends_with("full"), contains("locgov_emp")) %>%
    pivot_longer(cols=c(-fips_long, -pct1920, -firsthigh_l, -county_name, -state_name), names_to="year", names_prefix="gov|_full|locgov_emp") %>%
    full_join(dat %>%
                select(fips_long,
                       contains("pop")&ends_with("full"), contains(c("Population", "pop197", "pop198", "pop199", "pop2")), -contains("lab")) %>%
                pivot_longer(cols=c(-fips_long), names_to="year", values_to="pop", names_prefix="pop|_full|Population_")
    ) %>%
    # limit to years with employment data 
    filter(year %in% c(1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 
                       1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997,
                       2002, 2007)) %>%
    # recode gov employment variable: set to NA when population missing, and add 1 for logging
    mutate(value = case_when(!is.na(pop)~value+1)) %>%
    mutate(year = as.numeric(year)) %>%
    filter(year<2012)
  
  # filter to treated and control observations and create indicators
  datb <- datb %>%
    filter((fips_long %in% cvec)|(fips_long %in% tvec)) %>%
    mutate(firsthigh_l = case_when(!is.na(firsthigh_l)~firsthigh_l,
                                   T~0)) %>%
    mutate(post = case_when(firsthigh_l==0~0,
                            year>firsthigh_l~1,
                            T~0)) 
  
  # calculate residualized employment
  mod1 <- lm(log(value)~log(pop)*state_name*factor(year), data=datb[!(datb$fips_long %in% treated),], na.action=na.exclude)
  datb$resid_gov <- log(datb$value) - predict(mod1, type="response", newdata = datb)
  
  # run diff in diff model
  mod3 <- att_gt(yname="resid_gov",
                 tname = "year",
                 idname = "fips_long",
                 gname = "firsthigh_l",
                 data=datb %>% drop_na(pop) %>% filter((year - firsthigh_l)<=40|(year - firsthigh_l)>1000) %>% mutate(fips_long = as.numeric(fips_long)),
                 weightsname = "pop",
                 control = "nevertreated",
                 allow_unbalanced_panel = T)
  res <- aggte(mod3, type = "simple", na.rm=T)
  return(res)
}

# run original estimate
out <- run_estimate(cvec, tvec, high_top=.4, high_bottom=.2)

# create table to store results
tab1 <- data.frame("high_top"=numeric(length = 45),
                   "high_bottom"=numeric(length = 45),
                   "some"=numeric(length = 45),
                   "none"=numeric(length = 45),
                   "att"=numeric(length = 45),
                   "se"=numeric(length = 45))

# set thresholds
tab1$some <- rep(c(.01, .05, .1), 15)
tab1$none <- rep(c(.01, .05, .1), 15)
tab1$high_top <- rep(c(rep(.2, 3), rep(.3, 3), rep(.4, 3), rep(.5, 3), rep(.6, 3)), 3)
tab1 <- bind_rows(tab1, tab1, tab1)
tab1$high_bottom <- c(rep(.1, 45), rep(.2, 45), rep(.3, 45))

# create vectors for treated and control assignments
tab1$tvec <- NA
tab1$cvec <- NA
for(i in 1:nrow(tab1)){
  tab1$tvec[i] <- list(create_tvec(high_top=tab1$high_top[i], high_bottom=tab1$high_bottom[i], some=tab1$some[i]))
  tab1$cvec[i] <- list(create_cvec(none=tab1$none[i]))
}

# remove duplicates
tab1 <- tab1 %>% distinct()
# run for all combinations
for(i in 1:nrow(tab1)){
  out <- run_estimate(unlist(tab1$cvec[i]), unlist(tab1$tvec[i]),
                      high_bottom=tab1$high_bottom[i], high_top = tab1$high_top[i])
  tab1$att[i] <- out$overall.att
  tab1$se[i] <- out$overall.se
  print(i)
}

# add labels
tab1 <- bind_rows(tab1, 
                  c("high_top"=.4, "high_bottom"=.2, "some"=.05, "none"=.05, "att"=out$overall.att, "se"=out$overall.se))

tab1$main <- ifelse(tab1$high_top==.4&tab1$high_bottom==.2&tab1$some==.05, "Primary Model", "Alternatives")

# plot results 
ggplot(tab1 %>%
              distinct(high_bottom, high_top, some, .keep_all=TRUE) %>%
              mutate(across(c(high_bottom, some, main), as.character)) %>%
              mutate(high_bottom = case_when(high_bottom==.1~"Min. First Year Coal Ind.: 0.1",
                                             high_bottom==.2~"Min. First Year Coal Ind.: 0.2",
                                             high_bottom==.3~"Min. First Year Coal Ind.: 0.3")) %>%
              mutate(some = case_when(some==.01~"Max. for No Coal Ind.: 0.01",
                                      some==.05~"Max. for No Coal Ind.: 0.05",
                                      some==.1~"Max. for No Coal Ind.: 0.1"))) +
  geom_point(aes(x=high_top, y=att, color=main)) +
  geom_errorbar(aes(x=high_top, ymin=(att-1.96*se), ymax=(att+1.96*se), color=main), width=.01) +
  facet_wrap(~some+high_bottom, nrow=3) +
  theme_bw() + theme(text=element_text(size=15)) +
  xlab("Minimum Peak Coal Employment") + ylab("Estimated ATT") +
  geom_hline(aes(yintercept=-.31), linetype=3) +
  geom_hline(aes(yintercept=0), linetype=2, color="gray60") +
  scale_color_manual(name="Specification", values=c("black", "blue")) +
  ggtitle("Estimated ATT with Alternative Treatment Thresholds")



  # A.2.7: alternative models ####

# extended TWFE (mundlak)
p_load(etwfe)
datb <- datb %>% left_join(dat %>% select(fips_long, coal_tons))
mod1 = etwfe(
  fml  = resid_gov~1, 
  tvar = year,        
  gvar = firsthigh,
  data = datb %>% filter(!is.na(pop)) %>% filter(coal_tons>0),    
  vcov = ~fips_long,
  cgroup = "never"
)
ests <- emfx(mod1, type="event")

plot(ests, main="Effect on Government Employment by Year")


# synthetic control 
p_load(Synth)

datb <- datb %>% mutate(unit = as.numeric(fips_long))
# set potential controls
controls <- datb %>%
  filter(firsthigh==0) %>%
  filter(!(fips_long %in% c("21039", "21201", "42023", "21087", "21187", "21215"))) %>%
  select(unit) %>%
  distinct() %>%
  unlist()

# find matched control counties by cohort

  # 1920
# potential controls
dat_c <- datb %>% filter(unit %in% controls)
# treated units
dat_t <- datb %>% 
  filter(firsthigh==1920) %>%
  group_by(year) %>%
  summarize(resid_gov=median(resid_gov, na.rm=T)) %>%
  mutate(unit=1)
# combine
dat_sub <- bind_rows(dat_t, 
                     dat_c %>% select(year, resid_gov, unit))

# prepare for matching
db <- dataprep(foo=as.data.frame(dat_sub),
               dependent="resid_gov",
               unit.variable = "unit",
               time.variable="year",
               predictors=c("resid_gov"),
               time.predictors.prior=c(1860, 1870, 1880, 1900, 1910),
               time.optimize.ssr =c(1860, 1870, 1880, 1900, 1910),
               treatment.identifier=1,
               controls.identifier=controls)
# run synthetic
mod1 <- synth(db)
synth.tables <- synth.tab(
  dataprep.res = db,
  synth.res = mod1)
path.plot(dataprep.res = db,synth.res = mod1)
# extract weights etc
w <- as.data.frame(mod1$solution.w)
w$unit <- as.numeric(rownames(mod1$solution.w))

# combine into frame for plotting
d1920 <- dat_sub %>%
  left_join(w) %>%
  filter(unit!=1) %>%
  bind_rows(datb %>% filter(firsthigh==1920) %>% select(year, resid_gov, unit) %>% mutate(unit=1)) %>%
  mutate(w.weight = case_when(unit==1~1,
                              T~w.weight)) %>%
  mutate(cohort = 1920)


  # 1910
dat_t <- datb %>% 
  filter(firsthigh==1910) %>%
  group_by(year) %>%
  summarize(resid_gov=median(resid_gov, na.rm=T)) %>%
  mutate(unit=1)
dat_sub <- bind_rows(dat_t, 
                     dat_c %>% select(year, resid_gov, unit))


db <- dataprep(foo=as.data.frame(dat_sub),
               dependent="resid_gov",
               unit.variable = "unit",
               time.variable="year",
               predictors=c("resid_gov"),
               time.predictors.prior=c(1860, 1870, 1880, 1900),
               time.optimize.ssr =c(1860, 1870, 1880, 1900),
               treatment.identifier=1,
               controls.identifier=controls)
mod1 <- synth(db)
synth.tables <- synth.tab(
  dataprep.res = db,
  synth.res = mod1)
path.plot(dataprep.res = db,synth.res = mod1)

w <- as.data.frame(mod1$solution.w)
w$unit <- as.numeric(rownames(mod1$solution.w))

d1910 <- dat_sub %>%
  left_join(w) %>%
  filter(unit!=1) %>%
  bind_rows(datb %>% filter(firsthigh==1910) %>% select(year, resid_gov, unit) %>% mutate(unit=1)) %>%
  mutate(w.weight = case_when(unit==1~1,
                              T~w.weight)) %>%
  mutate(cohort = 1910)

  # 1900
dat_t <- datb %>% 
  filter(firsthigh==1900) %>%
  group_by(year) %>%
  summarize(resid_gov=median(resid_gov, na.rm=T)) %>%
  mutate(unit=1)
dat_sub <- bind_rows(dat_t, 
                     dat_c %>% select(year, resid_gov, unit))


db <- dataprep(foo=as.data.frame(dat_sub),
               dependent="resid_gov",
               unit.variable = "unit",
               time.variable="year",
               predictors=c("resid_gov"),
               time.predictors.prior=c(1860, 1870, 1880),
               time.optimize.ssr =c(1860, 1870, 1880),
               treatment.identifier=1,
               controls.identifier=controls)
mod1 <- synth(db)
synth.tables <- synth.tab(
  dataprep.res = db,
  synth.res = mod1)
path.plot(dataprep.res = db,synth.res = mod1)

w <- as.data.frame(mod1$solution.w)
w$unit <- as.numeric(rownames(mod1$solution.w))

d1900 <- dat_sub %>%
  left_join(w) %>%
  filter(unit!=1) %>%
  bind_rows(datb %>% filter(firsthigh==1900) %>% select(year, resid_gov, unit) %>% mutate(unit=1)) %>%
  mutate(w.weight = case_when(unit==1~1,
                              T~w.weight)) %>%
  mutate(cohort = 1900) 

  # 1880
dat_t <- datb %>% 
  filter(firsthigh==1880) %>%
  group_by(year) %>%
  summarize(resid_gov=median(resid_gov, na.rm=T)) %>%
  mutate(unit=1)
dat_sub <- bind_rows(dat_t, 
                     dat_c %>% select(year, resid_gov, unit))


db <- dataprep(foo=as.data.frame(dat_sub),
               dependent="resid_gov",
               unit.variable = "unit",
               time.variable="year",
               predictors=c("resid_gov"),
               time.predictors.prior=c(1860, 1870),
               time.optimize.ssr =c(1860, 1870),
               treatment.identifier=1,
               controls.identifier=controls)
mod1 <- synth(db)
synth.tables <- synth.tab(
  dataprep.res = db,
  synth.res = mod1)
path.plot(dataprep.res = db,synth.res = mod1)

w <- as.data.frame(mod1$solution.w)
w$unit <- as.numeric(rownames(mod1$solution.w))

d1880 <- dat_sub %>%
  left_join(w) %>%
  filter(unit!=1) %>%
  bind_rows(datb %>% filter(firsthigh==1880) %>% select(year, resid_gov, unit) %>% mutate(unit=1)) %>%
  mutate(w.weight = case_when(unit==1~1,
                              T~w.weight)) %>%
    mutate(cohort = 1880) 

  # 1870
dat_t <- datb %>% 
  filter(firsthigh==1870) %>%
  group_by(year) %>%
  summarize(resid_gov=median(resid_gov, na.rm=T)) %>%
  mutate(unit=1)
dat_sub <- bind_rows(dat_t, 
                     dat_c %>% select(year, resid_gov, unit))


db <- dataprep(foo=as.data.frame(dat_sub),
               dependent="resid_gov",
               unit.variable = "unit",
               time.variable="year",
               predictors=c("resid_gov"),
               time.predictors.prior=c(1860),
               time.optimize.ssr =c(1860),
               treatment.identifier=1,
               controls.identifier=controls)
mod1 <- synth(db)
synth.tables <- synth.tab(
  dataprep.res = db,
  synth.res = mod1)
path.plot(dataprep.res = db,synth.res = mod1)

w <- as.data.frame(mod1$solution.w)
w$unit <- as.numeric(rownames(mod1$solution.w))

d1870 <- dat_sub %>%
  left_join(w) %>%
  filter(unit!=1) %>%
  bind_rows(datb %>% filter(firsthigh==1870) %>% select(year, resid_gov, unit) %>% mutate(unit=1)) %>%
  mutate(w.weight = case_when(unit==1~1,
                              T~w.weight)) %>%
    mutate(cohort = 1870) 

# combine across years
d_all <- bind_rows(d1920, d1910, d1900, d1880, d1870)
# remove extras
rm(d1920, d1910, d1900, d1880, d1870)

# calculate year relative to treatment
d_all$rel_year <- d_all$year - d_all$cohort

# plot values
a <- d_all %>%
  group_by(c = unit==1, rel_year) %>%
  summarize(mn=weighted.mean(resid_gov, w=w.weight, na.rm=T)) %>%
  mutate(Group = case_when(c==TRUE~"Coal Counties      ",
                           c==FALSE~"Controls")) %>%
  ggplot() + 
  geom_point(aes(x=rel_year, y=mn, color=Group)) +
  coord_cartesian(ylim=c(-2, 1)) + 
  geom_vline(xintercept=0) +
  theme_bw() + xlab("Year Relative to Coal Arrival") +
  ylab("Government Employment") + 
  theme(text=element_text(size=15)) +
  scale_color_manual(values=c("black", "gray"))
# plot difference
b <- d_all %>%
  group_by(c = unit==1, rel_year) %>%
  summarize(mn=weighted.mean(resid_gov, w=w.weight, na.rm=T)) %>%
  mutate(Group = case_when(c==TRUE~"t",
                           c==FALSE~"c")) %>%
  pivot_wider(id_cols=c(rel_year), names_from=Group, values_from=mn) %>%
  mutate(diff=t-c) %>%
  mutate(Statistic="Difference (T - C)") %>%
  ggplot() + 
  geom_point(aes(x=rel_year, y=diff, color=Statistic)) +
  coord_cartesian(ylim=c(-1, .5)) + 
  geom_hline(yintercept=0) +
  geom_vline(xintercept=0) +
  theme_bw() + xlab("Year Relative to Coal Arrival") +
  ylab("Difference") + 
  theme(text=element_text(size=15)) +
  scale_color_manual(values=c("black"))


grid.arrange(a,b, nrow=2)



rm(a, b, d_all, dat_c, dat_sub, dat_t, datb, datc, db, did_out, dodge, ests, 
   mod1, mod2, mod3, mod3agg, out, plottab, pre_mod, res, synth.tables, tab1, temp, w)
# A.3: Instrumental Variables ####
  # setup ####

# create spending total variables (rather than per capita)
dat$spend1957 <- dat$Total.Expenditure_PC_1957*dat$Population_1957
dat$spend1972 <- dat$Total.Expenditure_PC_1972*dat$pop1972
dat$spend1977 <- dat$Total.Expenditure_PC_1977*dat$pop1977
dat$spend1982 <- dat$Total.Expenditure_PC_1982*dat$pop1982
dat$spend1992 <- dat$Total.Expenditure_PC_1992*dat$pop1992

# create panel dataset with employment, spending, revenue, property values, coal
datb <- dat %>%
  select(fips_long, county_name, state_name, 
         contains("gov")&ends_with("full"), contains("locgov_emp"), -contains("old")) %>%
  pivot_longer(cols=c(-fips_long, -county_name, -state_name), names_to="year", values_to="gov", names_prefix="gov|_full|locgov_emp") %>%
  full_join(dat %>%
              select(fips_long,
                     contains("pop")&ends_with("full"), contains(c("Population", "pop197", "pop198", "pop199", "pop2")), -contains("lab")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pop", names_prefix="pop|_full|Population_")
  ) %>%
  full_join(dat %>%
              select(fips_long,
                     contains("pct1"), -contains("quartile")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pct", names_prefix="pct")
  ) %>%
  full_join(dat %>%
              select(fips_long,
                     contains("taxlocal"), contains("proptax"), -contains("percap")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="revenue", names_prefix="taxlocal|proptax") %>%
              mutate(year = case_when(year=="1902"~"1900",
                                      year=="1912"~"1910",
                                      year=="1922"~"1920", 
                                      year=="1931"~"1930",
                                      T~year))
  ) %>%
  full_join(dat %>%
              select(fips_long,
                     contains("spend")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="spend", names_prefix="spend")
  ) %>%
  full_join(dat %>%
              select(fips_long,
                     contains("pval"), contains("Gross_AV_Total_")) %>%
              pivot_longer(cols=c(-fips_long), names_to="year", values_to="pval", names_prefix="pval|Gross_AV_Total_") %>%
              mutate(year = case_when(year=="1902"~"1900",
                                      year=="1912"~"1910",
                                      year=="1922"~"1920", 
                                      year=="1931"~"1930",
                                      T~year)) %>%
              filter(year!=12) %>%
              mutate(pval = case_when(year>1940~pval*1000, 
                                      T~pval))
  ) %>%
  left_join(dat %>% select(fips_long, pct1920, coal_tons)) %>%
  mutate(pop = case_when(pop!=0~pop)) %>%
  mutate(year = as.numeric(year)) %>%
  mutate(pct = rescale(pct, to=c(0,1))) %>%
  mutate(pct1920 = rescale(pct1920, to=c(0,1)))



  # A.3.1: full model results ####
options(modelsummary_factory_default = 'kableExtra')

# employment and revenu pre-1950
mod1 <- lm_robust(log(gov+1)~pct+log(pop), data=datb %>% filter(year<1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod3 <- lm_robust(log(revenue+1)~pct+log(pop), data=datb %>% filter(year<1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod2 <- iv_robust(log(gov+1)~pct+log(pop)|coal_tons+log(pop), data=datb %>% filter(year<1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)
mod4 <- iv_robust(log(revenue+1)~pct+log(pop)|coal_tons+log(pop), data=datb %>% filter(year<1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)

# print table
rows <- tribble(~term, ~a, ~b, ~c, ~d,
                "Fixed Effects", "State Year", "State Year", "State Year", "State Year",
                "N Clusters", as.character(mod1$nclusters), as.character(mod2$nclusters), 
                as.character(mod3$nclusters), as.character(mod4$nclusters))
attr(rows, 'position') <- c(6:8)
modelsummary(list("Employment: OLS"=mod1, "Employment: IV"=mod2, "Revenue: OLS"=mod3, "Revenue: IV"=mod4), 
             output = "kableExtra", format="latex",
             coef_map = c(
               "pct"="Prop. Labor in Coal",
               "log(pop)"="Log(Population Size)"),
             gof_omit="Std.Errors|AIC|BIC|RMSE",
             stars = TRUE, title = "Coal Employment and Local Government Capacity: Pre-1950",
             add_rows=rows) %>%
  kable_styling(latex_options = "hold_position") 


# employment, revenue, and spending post-1950
mod1 <- lm_robust(log(gov+1)~pct1920+log(pop), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod2 <- iv_robust(log(gov+1)~pct1920+log(pop)|coal_tons+log(pop), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)

mod3 <- lm_robust(log(revenue+1)~pct1920+log(pop), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod4 <- iv_robust(log(revenue+1)~pct1920+log(pop)|coal_tons+log(pop), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)

mod5 <- lm_robust(log(spend+1)~pct1920+log(pop), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod6 <- iv_robust(log(spend+1)~pct1920+log(pop)|coal_tons+log(pop), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)

# print table
rows <- tribble(~term, ~a, ~b, ~c, ~d, ~e, ~f,
                "Fixed Effects", "State Year", "State Year", "State Year", "State Year", "State Year", "State Year",
                "N Clusters", as.character(mod1$nclusters), as.character(mod2$nclusters), 
                as.character(mod3$nclusters), as.character(mod4$nclusters), as.character(mod5$nclusters),
                as.character(mod6$nclusters))
attr(rows, 'position') <- c(6:8)
modelsummary(list("Employment: OLS"=mod1, "Employment: IV"=mod2, 
                  "Revenue: OLS"=mod3, "Revenue: IV"=mod4,
                  "Spending: OLS"=mod3, "Spending: IV"=mod4), 
             output = "kableExtra", format="latex",
             coef_map = c(
               "pct1920"="Prop. in Coal: 1920",
               "log(pop)"="Log(Population Size)"),
             gof_omit="Std.Errors|AIC|BIC|RMSE",
             stars = TRUE, title = "Coal Employment and Local Government Capacity: Post-1950",
             add_rows=rows) %>%
  kable_styling(latex_options = "hold_position") 

  # A.3.2: adding property value control ####

# employment and revenue pre-1950
mod1 <- lm_robust(log(gov+1)~pct+log(pop)+log(pval), data=datb %>% filter(year<1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod3 <- lm_robust(log(revenue+1)~pct+log(pop)+log(pval), data=datb %>% filter(year<1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod2 <- iv_robust(log(gov+1)~pct+log(pop)+log(pval)|coal_tons+log(pop)+log(pval), data=datb %>% filter(year<1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)
mod4 <- iv_robust(log(revenue+1)~pct+log(pop)+log(pval)|coal_tons+log(pop)+log(pval), data=datb %>% filter(year<1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)

rows <- tribble(~term, ~a, ~b, ~c, ~d,
                "Fixed Effects", "State Year", "State Year", "State Year", "State Year",
                "N Clusters", as.character(mod1$nclusters), as.character(mod2$nclusters), 
                as.character(mod3$nclusters), as.character(mod4$nclusters))
attr(rows, 'position') <- c(8:10)
modelsummary(list("Employment: OLS"=mod1, "Employment: IV"=mod2, "Revenue: OLS"=mod3, "Revenue: IV"=mod4), 
             output = "kableExtra", format="latex",
             coef_map = c(
               "pct"="Prop. Labor in Coal",
               "log(pop)"="Log(Population Size)",
               "log(pval)"="Log(Property Value)"),
             gof_omit="Std.Errors|AIC|BIC|RMSE",
             stars = TRUE, title = "Coal Employment and Local Government Capacity: Pre-1950",
             add_rows=rows) %>%
  kable_styling(latex_options = "hold_position") 


# employment, revenue, and spending post-1950
mod1 <- lm_robust(log(gov+1)~pct1920+log(pop)+log(pval), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod2 <- iv_robust(log(gov+1)~pct1920+log(pop)+log(pval)|coal_tons+log(pop)+log(pval), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)

mod3 <- lm_robust(log(revenue+1)~pct1920+log(pop)+log(pval), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod4 <- iv_robust(log(revenue+1)~pct1920+log(pop)+log(pval)|coal_tons+log(pop)+log(pval), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)

mod5 <- lm_robust(log(spend+1)~pct1920+log(pop)+log(pval), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata", weights=pop)
mod6 <- iv_robust(log(spend+1)~pct1920+log(pop)+log(pval)|coal_tons+log(pop)+log(pval), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata", weights=pop)


rows <- tribble(~term, ~a, ~b, ~c, ~d, ~e, ~f,
                "Fixed Effects", "State Year", "State Year", "State Year", "State Year", "State Year", "State Year",
                "N Clusters", as.character(mod1$nclusters), as.character(mod2$nclusters), 
                as.character(mod3$nclusters), as.character(mod4$nclusters), as.character(mod5$nclusters),
                as.character(mod6$nclusters))
attr(rows, 'position') <- c(8:10)
modelsummary(list("Employment: OLS"=mod1, "Employment: IV"=mod2, 
                  "Revenue: OLS"=mod3, "Revenue: IV"=mod4,
                  "Spending: OLS"=mod3, "Spending: IV"=mod4), 
             output = "kableExtra", format="latex",
             coef_map = c(
               "pct1920"="Prop. in Coal: 1920",
               "log(pop)"="Log(Population Size)",
               "log(pval)"="Log(Property Value)"),
             gof_omit="Std.Errors|AIC|BIC|RMSE",
             stars = TRUE, title = "Coal Employment and Local Government Capacity: Post-1950",
             add_rows=rows) %>%
  kable_styling(latex_options = "hold_position") 


  # A.3.3: results by year and with spatial adjustment ####

# load relevant packages
p_load(sphet, spatialreg, spdep, sp)

# bring in longitude and latitude
datb <- left_join(datb, dat %>% select(fips_long, INTPTLAT, INTPTLONG))
# store full data 
datb_full <- datb

# function that takes year, DV; creates weight matrix and removes missing data w/in year
run_regs <- function(dv, yr){
  # specify DV
  datb_full$dv <- unlist(datb_full[,dv])
  # set year of coal employment IV and remove missings
  datb <- datb_full %>% 
    mutate(pct = case_when(yr>1940~pct1920,
                           T~pct)) %>%
    filter(!is.na(dv)&!is.na(pop)&!is.na(pct)&year==yr) %>% 
    rename(Y=INTPTLAT, X=INTPTLONG)
  
  # create spatial weights
  W_creator = function(yr){
    Coords=datb %>% select(X,Y) %>% as.matrix() 
    study_spatial=SpatialPoints(coords=Coords)
    nearest=knn2nb(knearneigh(study_spatial,k=5,longlat = T))   #k nearest neighbours
    nearest=nb2listw(nearest,style="W", zero.policy=TRUE)
    W = listw2mat(nearest)
    return(W)
  }
  
  # transform spatial weights
  WW = list(NULL)
  WW = W_creator(yr=yr)
  WWW = bdiag(WW)
  WWW = as(WWW,"matrix")
  
  # run OLS model
  mod_ols <- lm_robust(log(dv+1)~pct+log(pop), data=datb, fixed_effects=state_name, clusters=fips_long, se_type="stata")
  # run spatial OLS model
  mod_sols = spreg(log(dv+1) ~pct+log(pop)+state_name, data=datb, 
                   listw = WWW, 
                   initial.value = 0.2, model = "lag", het = TRUE, na.action=na.omit)
  # run IV model
  mod_iv <- iv_robust(log(dv+1)~pct+log(pop)|coal_tons+log(pop), data=datb, clusters=fips_long, se_type="stata")
  # run spatial IV model
  mod_siv = spreg(log(dv+1) ~log(pop)+state_name, data=datb, 
                  listw = WWW, endog =~ pct, instruments = ~ coal_tons, lag.instr = TRUE, 
                  initial.value = 0.2, model = "lag", het = TRUE, na.action=na.omit)
  # store results 
  res <- data.frame("model"=c("OLS", "SAR", "IV", "S-2SLS"),
                    "dv"=rep(dv, 4),
                    "year"=rep(yr, 4),
                    "est"=c(mod_ols$coefficients["pct"], mod_sols$coefficients["pct",], 
                            mod_iv$coefficients["pct"], mod_siv$coefficients["pct",]),
                    "ci_lo"=c(mod_ols$conf.low["pct"], 
                              mod_sols$coefficients["pct",]-1.96*sqrt(mod_sols$var["pct", "pct"]),
                              mod_iv$conf.low["pct"],
                              mod_siv$coefficients["pct",]-1.96*sqrt(mod_siv$var["pct", "pct"])),
                    "ci_hi"=c(mod_ols$conf.high["pct"], 
                              mod_sols$coefficients["pct",]+1.96*sqrt(mod_sols$var["pct", "pct"]),
                              mod_iv$conf.high["pct"],
                              mod_siv$coefficients["pct",]+1.96*sqrt(mod_siv$var["pct", "pct"])))
  return(res)
}

# run for each year
res_tab <- run_regs(dv="gov", yr=1880)
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1900))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1910))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1920))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1930))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1940))

res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1880))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1900))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1910))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1920))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1930))

# nice labels
res_tab <- res_tab %>%
  mutate(dv = recode(dv, "gov"="Employment", "revenue"="Revenue")) %>%
  mutate(Model=case_when(model %in% c("OLS", "SAR")~"Naive",
                         model %in% c("IV", "S-2SLS")~"IV")) %>%
  mutate(Spatial=case_when(model %in% c("OLS", "IV")~"No",
                           model %in% c("SAR", "S-2SLS")~"Yes"))

# plot
dodge <- position_dodge(width=4)
a <- res_tab %>%
  ggplot(aes(x=year, y=est, shape=Spatial, color=Model)) +
  geom_point(position=dodge) +
  geom_errorbar(aes(x=year,
                    ymin=ci_lo,
                    ymax=ci_hi),
                position=dodge) +
  theme_bw() + facet_wrap(~dv) +
  xlab("Year") + ylab("Estimate") +
  ggtitle("Change in Government Capacity: Additional Models, pre-1950") +
  theme(text=element_text(size=15), plot.title = element_text(hjust = 0.5)) +
  geom_hline(yintercept=0, linetype=2) + scale_color_manual(name="Model", values=c("black", 'gray80'))

# repeat for later years 
res_tab <- run_regs(dv="gov", yr=1957)
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1962))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1967))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1972))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1977))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1982))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1987))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1992))
res_tab <- bind_rows(res_tab, run_regs(dv="gov", yr=1997))

res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1957))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1962))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1967))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1972))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1977))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1982))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1987))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1992))
res_tab <- bind_rows(res_tab, run_regs(dv="revenue", yr=1997))

res_tab <- bind_rows(res_tab, run_regs(dv="spend", yr=1957))
res_tab <- bind_rows(res_tab, run_regs(dv="spend", yr=1972))
res_tab <- bind_rows(res_tab, run_regs(dv="spend", yr=1977))
res_tab <- bind_rows(res_tab, run_regs(dv="spend", yr=1982))
res_tab <- bind_rows(res_tab, run_regs(dv="spend", yr=1992))

# nice labels
res_tab <- res_tab %>%
  mutate(Model=factor(model, levels=c("OLS", "SAR", "IV", "S-2SLS", ordered=TRUE))) %>%
  mutate(dv = recode(dv, "gov"="Employment", "revenue"="Revenue", "spend"="Spending")) %>%
  mutate(Model=case_when(model %in% c("OLS", "SAR")~"Naive",
                         model %in% c("IV", "S-2SLS")~"IV")) %>%
  mutate(Spatial=case_when(model %in% c("OLS", "IV")~"No",
                           model %in% c("SAR", "S-2SLS")~"Yes"))
# plot results
dodge <- position_dodge(width=4)
b <- res_tab %>%
  ggplot(aes(x=year, y=est, shape=Spatial, color=Model)) +
  geom_point(position=dodge) +
  geom_errorbar(aes(x=year,
                    ymin=ci_lo,
                    ymax=ci_hi),
                position=dodge) +
  theme_bw() + facet_wrap(~dv) +
  xlab("Year") + ylab("Estimate") +
  ggtitle("Change in Government Capacity: Additional Models, post-1950") +
  theme(text=element_text(size=15), plot.title = element_text(hjust = 0.5)) +
  geom_hline(yintercept=0, linetype=2) + scale_color_manual(name="Model", values=c("black", 'gray80'))

# print plot
grid.arrange(a,b)


  # A.3.4: results without population weights ####

# employment
  # pre-1950, OLS
mod1 <- lm_robust(log(gov+1)~pct+log(pop), data=datb %>% filter(year<1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata")
res <- data.frame("Outcome"="Employment",
                  "Period"="Pre-1950",
                  "Model"="OLS", 
                  "Estimate"=mod1$coefficients[1],
                  "ci_lo"=mod1$conf.low[1],
                  "ci_hi"=mod1$conf.high[1])
  # pre-1950, IV
mod1 <- iv_robust(log(gov+1)~pct+log(pop)|coal_tons+log(pop), data=datb %>% filter(year<1950), fixed_effects=year, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Employment",
                                 "Period"="Pre-1950",
                                 "Model"="IV", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))
  # post-1950, OLS
mod1 <- lm_robust(log(gov+1)~pct1920+log(pop), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Employment",
                                 "Period"="Post-1950",
                                 "Model"="OLS", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))
  # post-1950, IV
mod1 <- iv_robust(log(gov+1)~pct1920+log(pop)|coal_tons+log(pop), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Employment",
                                 "Period"="Post-1950",
                                 "Model"="IV", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))

# repeat for revenue and spending

  # revenue
mod1 <- lm_robust(log(revenue+1)~pct+log(pop), data=datb %>% filter(year<1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Revenue",
                                 "Period"="Pre-1950",
                                 "Model"="OLS", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))
mod1 <- iv_robust(log(revenue+1)~pct+log(pop)|coal_tons+log(pop), data=datb %>% filter(year<1950), fixed_effects=year, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Revenue",
                                 "Period"="Pre-1950",
                                 "Model"="IV", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))
mod1 <- lm_robust(log(revenue+1)~pct1920+log(pop), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Revenue",
                                 "Period"="Post-1950",
                                 "Model"="OLS", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))
mod1 <- iv_robust(log(revenue+1)~pct1920+log(pop)|coal_tons+log(pop), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Revenue",
                                 "Period"="Post-1950",
                                 "Model"="IV", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))

  # spending
mod1 <- lm_robust(log(spend+1)~pct1920+log(pop), data=datb %>% filter(year>1950), fixed_effects=year+state_name, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Spending",
                                 "Period"="Post-1950",
                                 "Model"="OLS", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))
mod1 <- iv_robust(log(spend+1)~pct1920+log(pop)|coal_tons+log(pop), data=datb %>% filter(year>1950), fixed_effects=year, clusters=fips_long, se_type="stata")
res <- bind_rows(res, data.frame("Outcome"="Spending",
                                 "Period"="Post-1950",
                                 "Model"="IV", 
                                 "Estimate"=mod1$coefficients[1],
                                 "ci_lo"=mod1$conf.low[1],
                                 "ci_hi"=mod1$conf.high[1]))


# plot results 
dodge <- position_dodge(width=.2)
res %>%
  mutate(Period = factor(Period, levels=c("Pre-1950", "Post-1950"), ordered=TRUE)) %>%
  ggplot() +
  geom_point(aes(x=Outcome, y=Estimate, color=Model), position=dodge) +
  geom_errorbar(aes(x=Outcome, ymin=ci_lo, ymax=ci_hi, color=Model), position=dodge) +
  facet_wrap(~Period, scales="free_x") + theme_bw() +
  theme(text=element_text(size=15), plot.title = element_text(hjust = 0.5)) +
  geom_hline(yintercept=0, linetype=2) + 
  scale_color_manual(name="Model", values=c("black", "gray70"))


